---
title: "Data manipulation with polars"
author: "Tony Duan"
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: right
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---

Polars is a DataFrame interface on top of an OLAP Query Engine implemented in Rust, using the [Apache Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) as its memory model.

- Lazy \| eager execution
- Multi-threaded
- SIMD
- Query optimization
- Powerful expression API
- Hybrid Streaming (larger-than-RAM datasets)
- Rust \| Python \| NodeJS \| R \| ...
```{r}
#Sys.setenv(RETICULATE_PYTHON = "/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")
library(reticulate)
#use_python("/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")
# datetime is part of the Python standard library, so it does not need to be installed
py_require(c("polars", "pandas", "numpy", "matplotlib", "duckdb", "seaborn", "pyarrow"))
```
## load packages
```{python}
# Import the pandas library for data manipulation
import pandas as pd
# Import numpy for numerical operations
import numpy as np
# Import matplotlib.pylab for plotting
import matplotlib.pylab as plt
# Import seaborn for statistical data visualization
import seaborn as sns
# Import polars for DataFrame operations
import polars as pl
# Import col and when from polars for expression building
from polars import col, when
# Import datetime for date and time operations
from datetime import datetime
# Import duckdb for in-memory SQL database operations
import duckdb
```
## import data from csv
```{python}
# Read the CSV file './data/mtcars.csv' into a Polars DataFrame
mtcars = pl.read_csv("./data/mtcars.csv")
```
## import data from pandas
```{python}
# Read the CSV file into a pandas DataFrame
mtcars_pd = pd.read_csv("./data/mtcars.csv")
# Print the type of mtcars_pd
type(mtcars_pd)
# Convert the pandas DataFrame to a Polars DataFrame
mtcars = pl.from_pandas(mtcars_pd)
# Print the type of mtcars
type(mtcars)
```
```{python}
# Select the 'cyl', 'mpg', and 'hp' columns from the mtcars DataFrame and take the first 5 rows
small_mtcars = mtcars.select(["cyl", "mpg", "hp"]).head(5)
```
## select column
### get column names
```{python}
# Get the column names of the small_mtcars DataFrame
small_mtcars.columns
```
### select columns by name
```{python}
# Select the 'cyl' and 'mpg' columns from the small_mtcars DataFrame
small_mtcars.select(["cyl", "mpg"])
```
### select columns whose names contain 'p'
```{python}
# List column names containing 'p' (use `c`, not `col`, to avoid shadowing polars' imported `col`)
cols_with_p = [c for c in small_mtcars.columns if "p" in c]
# Select the columns identified in cols_with_p from the small_mtcars DataFrame
small_mtcars.select(cols_with_p)
```
### select columns by index
#### select first and 3rd columns
```{python}
# Select the first (index 0) and third (index 2) columns of the small_mtcars DataFrame
small_mtcars.select([small_mtcars.columns[0], small_mtcars.columns[2]])
```
#### select first to third columns
```{python}
# Select columns from index 0 up to (but not including) index 3
small_mtcars.select(small_mtcars.columns[0:3])
```
## drop column
```{python}
# Drop the column named "cyl" from the small_mtcars DataFrame
small_mtcars.drop("cyl")
```
## Renaming column
```{python}
# Rename the column 'mpg' to 'new_name_mpg'
small_mtcars.rename({
    "mpg": "new_name_mpg"
})
```
## Create column
### Mutate
```{python}
# Apply transformations to the mtcars DataFrame
result = (
    mtcars.head()
    .with_columns([
        # Create a new column 'gear2' by adding 1 to the 'gear' column
        (col("gear") + 1).alias("gear2"),
        # Create a new column 'gear3': if 'gear' > 3 then 'long', else 'short'
        when(col("gear") > 3).then(pl.lit("long"))
            .otherwise(pl.lit("short")).alias("gear3"),
        # Create a new column 'qsec2' with conditional logic based on 'qsec' values
        when(col("qsec") <= 17).then(pl.lit("short"))
            .when(col("qsec") <= 18).then(pl.lit("Medium"))
            .otherwise(pl.lit("long")).alias("qsec2"),
    ])
)
# Print the resulting DataFrame
print(result)
```
### Transmute: create a column and keep only that column
```{python}
# Select the 'gear' column from the first few rows of mtcars, add 1 to it, and alias it as 'gear2'
mtcars.head().select([
    (col("gear") + 1).alias("gear2")
])
```
## Filter rows
```{python}
# Filter rows where the 'gear' column is equal to 4
mtcars.filter(
    col("gear") == 4
)
```
### Filters with AND conditions
```{python}
# Filter rows where 'cyl' is greater than 4 AND 'gear' is equal to 5
mtcars.filter(
    (col("cyl") > 4) & (col("gear") == 5)
)
```
### Filters with OR conditions
```{python}
# Filter rows where 'cyl' is equal to 6 OR 'gear' is equal to 5
mtcars.filter(
    (col("cyl") == 6) | (col("gear") == 5)
)
```
### filter rows by index
#### first 3
```{python}
# Select the first 3 rows of the small_mtcars DataFrame
small_mtcars.head(3)
```
#### last 3
```{python}
# Select the last 3 rows of the small_mtcars DataFrame
small_mtcars.tail(3)
```
#### 5th row
```{python}
# Select the row at index 4 (which is the 5th row)
mtcars[4]
```
#### 1st and 5th rows
```{python}
# Select rows at index 0 (1st row) and 4 (5th row)
mtcars[[0, 4]]
```
#### get rows 1 to 4
```{python}
# Select rows from index 0 up to (but not including) index 4
mtcars[0:4]
```
#### get 5 random rows
```{python}
# Select 5 random rows from the mtcars DataFrame, with a fixed seed for reproducibility
mtcars.sample(n=5, seed=42)
```
## Append
### append by row
```{python}
# Select rows from index 0 up to (but not including) index 4
data1 = mtcars[0:4]
# Select rows from index 10 up to (but not including) index 11
data2 = mtcars[10:11]
# Concatenate data1 and data2 DataFrames vertically
pl.concat([data1, data2], how="vertical")
```
### append by column
```{python}
# Select the 'mpg' column from small_mtcars
data1 = small_mtcars.select("mpg")
# Select the 'cyl' column from small_mtcars
data2 = small_mtcars.select("cyl")
# Concatenate data1 and data2 DataFrames horizontally
pl.concat([data1, data2], how="horizontal")
```
### Dropping NA values
```{python}
# Drop all rows that contain any null values
mtcars.drop_nulls()
```
### drop rows with NAs in specific columns
```{python}
# Drop rows that have null values in either the "mpg" or "disp" columns
mtcars.drop_nulls(subset=["mpg", "disp"])
```
### keep NA values
## group by
### average, min, max, sum
```{python}
# Group the mtcars DataFrame by the 'cyl' column and aggregate various statistics
tbl_query = (
    mtcars
    .group_by("cyl")
    .agg([
        pl.col("hp").mean().alias("avg_hp"),      # mean of 'hp'
        pl.col("hp").min().alias("min_hp"),       # minimum of 'hp'
        pl.col("hp").max().alias("max_hp"),       # maximum of 'hp'
        pl.col("disp").sum().alias("total_disp")  # sum of 'disp'
    ])
)
# Print the resulting aggregated DataFrame
tbl_query
```
### count records and count distinct records
```{python}
# Group the mtcars DataFrame by the 'cyl' column and count the number of rows in each group
mtcars.group_by("cyl").agg(pl.len().alias("n"))
```
```{python}
# Group the mtcars DataFrame by the 'cyl' column and count the number of unique 'hp' values for each group
mtcars.group_by("cyl").agg(pl.col("hp").n_unique().alias("n"))
```
## order rows
```{python}
# Sort the small_mtcars DataFrame by the 'hp' column in ascending order
small_mtcars.sort("hp")
```
### Sort in descending order
```{python}
# Sort the small_mtcars DataFrame by the 'hp' column in descending order
small_mtcars.sort("hp", descending=True)
```
### Arrange by multiple variables
```{python}
# Sort the small_mtcars DataFrame by 'cyl' in ascending order and 'mpg' in descending order
small_mtcars.sort(
    by=["cyl", "mpg"],
    descending=[False, True]  # True means descending
)
```
## join
```{python}
# Create a Polars DataFrame named lhs
lhs = pl.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})
# Create a Polars DataFrame named rhs
rhs = pl.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})
```
```{python}
# Display the lhs DataFrame
lhs
```
```{python}
# Display the rhs DataFrame
rhs
```
### inner_join
```{python}
# Perform an inner join of lhs and rhs DataFrames on the 'id' column
result = lhs.join(rhs, on="id", how="inner")
# Display the result
result
```
### full join
```{python}
# Perform a full join of lhs and rhs DataFrames on the 'id' column
result = lhs.join(rhs, on="id", how="full")
# Display the result
result
```
### left join
```{python}
# Perform a left join of lhs and rhs DataFrames on the 'id' column
result = lhs.join(rhs, on="id", how="left")
# Display the result
result
```
### anti join
keep rows from the left table that have no match in the right
```{python}
# Perform an anti-join: keep rows from lhs that do not have a match in rhs based on 'id'
result = lhs.join(rhs, on="id", how="anti")
# Display the result
result
```
keep rows from the right table that have no match in the left
```{python}
# Perform an anti-join: keep rows from rhs that do not have a match in lhs based on 'id'
result = rhs.join(lhs, on="id", how="anti")
# Display the result
result
```
## Reshape tables
```{python}
# Create a Polars DataFrame named costs
costs = pl.DataFrame({
    "id": [1, 2],
    "price_x": [0.1, 0.2],
    "price_y": [0.4, 0.5],
    "price_z": [0.7, 0.8],
})
# Display the DataFrame
costs
```
### Gather data long (wide to long)
Use `unpivot()` (called `melt()` in older polars releases) to reshape from wide to long:
```{python}
# Unpivot (melt) the costs DataFrame from wide to long format
costs_long = costs.unpivot(
    index="id",                            # Columns to keep as identifiers
    on=["price_x", "price_y", "price_z"],  # Columns to unpivot
    variable_name="type",                  # Name for the former column names
    value_name="price"                     # Name for the values
)
# Display the long format DataFrame
costs_long
```
### Spread data wide (long to wide)
```{python}
# Pivot the costs_long DataFrame from long to wide format
costs_wide = costs_long.pivot(
    values="price",
    index="id",
    on="type"
)
# Display the wide format DataFrame
costs_wide
```
## string
```{python}
# Create a Polars DataFrame named df
df = pl.DataFrame({
    "text": ["abc", "DDD", "1243c", "aeEe"],
    "num": [3, 4, 7, 8]
})
# Display the DataFrame
df
```
### upper case
```{python}
# Add a new column 'text_new' with the uppercase version of the 'text' column
df.with_columns(
    pl.col("text").str.to_uppercase().alias("text_new")
)
```
### lower case
```{python}
# Add a new column 'text_new' with the lowercase version of the 'text' column
df.with_columns(
    pl.col("text").str.to_lowercase().alias("text_new")
)
```
### match
```{python}
# Add multiple new columns based on string matching conditions
df.with_columns([
    # 'text_new1': 'T' if 'text' is exactly 'abc', else 'F'
    pl.when(pl.col("text") == "abc").then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new1"),
    # 'text_new2': 'T' if 'text' starts with 'a', else 'F'
    pl.when(pl.col("text").str.starts_with("a")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new2"),
    # 'text_new3': 'T' if 'text' ends with 'c', else 'F'
    pl.when(pl.col("text").str.ends_with("c")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new3"),
    # 'text_new4': 'T' if 'text' contains '4', else 'F'
    pl.when(pl.col("text").str.contains("4")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new4"),
])
```
### concatenation
```{python}
# Add a new column 'text_new1' by concatenating the 'text' column with itself, separated by " is "
df.with_columns(
    (pl.col("text") + " is " + pl.col("text")).alias("text_new1")
)
```
### replace
Use `.str.replace()` with a regular expression to replace patterns in strings; in polars the pattern is treated as a regex by default (pass `literal=True` for plain-text matching).
For example, the code below uses `"a."`, where `.` is a wildcard that matches any character.
```{python}
# Add a new column 'text_new1' by replacing patterns in the 'text' column using a regular expression
df.with_columns(
    pl.col("text").str.replace(r"a.", "XX").alias("text_new1")
)
```
### extract
Use `str.extract()` with a regular expression to pull out the matching capture group.
`text_new1` captures everything after the first 'a'; `text_new2` captures everything before the last 'c'.
```{python}
# Add new columns by extracting substrings from the 'text' column using regular expressions
df.with_columns([
    # 'text_new1': extract everything after the first 'a'
    pl.col("text").str.extract(r"a(.*)", 1).alias("text_new1"),
    # 'text_new2': extract everything before the last 'c'
    pl.col("text").str.extract(r"(.*)c", 1).alias("text_new2")
])
```
## date
Polars provides robust functionality for handling date and time data, including parsing, extracting components, and formatting.
```{python}
# Create a Polars DataFrame with 'dates' and 'times' columns containing string representations of dates and times
df_dates = pl.DataFrame({
    "dates": ["2023-04-05", "2024-05-06"],
    "times": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"]
})
# Display the DataFrame
df_dates
```
### Extracting Date Components
You can extract various components like year, month, day, hour, minute, second from datetime objects.
```{python}
# Convert 'dates' to date type and extract year, month, day
df_dates.with_columns([
    pl.col("dates").str.to_date().dt.year().alias("year"),
    pl.col("dates").str.to_date().dt.month().alias("month"),
    pl.col("dates").str.to_date().dt.day().alias("day"),
])
```
### Formatting Dates
Dates can be formatted into different string representations using `strftime()`.
```{python}
# Convert 'times' to datetime type and format it as YYYY-MM-DD HH:MM:SS
df_dates.with_columns(
    pl.col("times").str.to_datetime().dt.strftime("%Y-%m-%d %H:%M:%S").alias("formatted_time")
)
```
```{python}
# Add new columns by converting string columns to datetime and date objects
df_dates2 = df_dates.with_columns(
    # Convert 'times' column to datetime objects and alias as 'real_times'
    pl.col("times").str.to_datetime().alias("real_times"),
    # Convert 'dates' column to date objects and alias as 'real_dates'
    pl.col("dates").str.to_date().alias("real_dates")
)
# Display the DataFrame with new columns
df_dates2
```
```{python}
# Generate descriptive statistics for the df_dates2 DataFrame
df_dates2.describe()
```
## using polars with a database
### set up a DuckDB database with an mtcars table
```{python}
# Connect to DuckDB (in-memory by default)
con = duckdb.connect()
# Sample mtcars data as a list of row tuples
mtcars_data = [
    (21.0, 6, 160, 110, 3.90, 2.62, 16.46, 0, 1, 4, 4),
    (21.0, 6, 160, 110, 3.90, 2.875, 17.02, 0, 1, 4, 4),
    (22.8, 4, 108, 93, 3.85, 2.320, 18.61, 1, 1, 4, 1),
    (21.4, 6, 258, 110, 3.08, 3.215, 19.44, 1, 0, 3, 1),
]
# Define column names for the DataFrame
columns = [
    "mpg", "cyl", "disp", "hp", "drat", "wt", "qsec",
    "vs", "am", "gear", "carb"
]
# Create a Polars DataFrame; orient="row" makes explicit that each tuple is one row
df = pl.DataFrame(mtcars_data, schema=columns, orient="row")
```
### create table in duckdb
```{python}
# Register the Polars DataFrame 'df' as a table named 'mtcars' in the DuckDB connection
con.register("mtcars", df)
```
### show tables in duckdb
```{python}
# Execute an SQL query to show all tables and fetch the results
con.execute("SHOW TABLES").fetchall()
```
### create a query
```{python}
# Define an SQL query to select 'mpg', 'cyl', and 'hp' from the 'mtcars' table where 'cyl' is 6
query = "SELECT mpg, cyl, hp FROM mtcars WHERE cyl = 6"
```
### get data
```{python}
# Execute the SQL query and retrieve the result as a Polars DataFrame
result = con.execute(query).pl()
# Display the result
result
```
### LazyFrame
```{python}
# Execute the SQL query and retrieve the result as a Polars LazyFrame
lazy_df = con.execute(query).pl().lazy()
# Display the LazyFrame
lazy_df
```
### Collect to DataFrame
Because LazyFrame operations are deferred, calling `collect()` actually executes the query plan and returns a DataFrame.
```{python}
# Collect the LazyFrame, executing the query and returning a Polars DataFrame
result_df = lazy_df.collect()
# Display the resulting DataFrame
result_df
```
## reference
https://docs.pola.rs/api/python/stable/reference/index.html